library(dplyr)
library(tidyr)
library(Rtsne)
library(pals)
library(umap)
sortedcpg<-readRDS(file="hnsc_cpg1000_sorted.RDS")
set.seed(123)
cpgtsne<-Rtsne(sortedcpg, perplexity=15, exaggeration_factor=5)$Y
par(pty="s")
plot(cpgtsne, xlab= "TSNE dim 1", ylab="TSNE dim 2")
From the above, it appears there are about 5 clusters, maybe 6.
Dimension reduction using UMAP:
myumapsetting<-umap.defaults
myumapsetting$spread<-4 #used during automatic estimation of a/b parameters
myumapsetting$min_dist<-0.5 #must be smaller than "spread"; how close points appear in final layout
# UMAP
set.seed(123)
cpgumap<-umap(sortedcpg, myumapsetting)
par(pty="s")
plot(cpgumap[["layout"]], xlab="UMAP dimension 1", ylab="UMAP dimension 2")
library(Mercator)
## Loading required package: Thresher
## Loading required package: ClassDiscovery
## Loading required package: cluster
## Loading required package: oompaBase
## Loading required package: PCDimension
library(clValid)
## Warning: package 'clValid' was built under R version 4.0.5
clValid can compare internal validation metrics between Kmeans, hierarchical, and pam clustering for different values of K.
myclmethods<-c("hierarchical", "kmeans", "pam")
clvalidres<-clValid(sortedcpg, nClust=2:10, clMethods = myclmethods, validation="internal")
clvalidres@measures
## , , hierarchical
##
## 2 3 4 5 6
## Connectivity 8.7869048 14.6448413 14.64484127 56.11071429 56.11071429
## Dunn 0.5333043 0.5333043 0.53330425 0.45304181 0.45304181
## Silhouette 0.1639850 0.1249195 0.09750493 0.08811865 0.08418329
## 7 8 9 10
## Connectivity 56.11071429 56.11071429 73.68452381 76.61349206
## Dunn 0.45304181 0.45304181 0.45304181 0.45304181
## Silhouette 0.07594937 0.06454131 0.05928766 0.03623698
##
## , , kmeans
##
## 2 3 4 5 6
## Connectivity 178.1619048 240.79603175 247.22222222 322.30714286 417.40595238
## Dunn 0.3362252 0.37186285 0.36793614 0.36932335 0.34273851
## Silhouette 0.1000159 0.08897024 0.09247967 0.06231365 0.05457374
## 7 8 9 10
## Connectivity 438.82142857 444.11309524 461.75119048 538.12023810
## Dunn 0.41149264 0.38576992 0.40702368 0.37643895
## Silhouette 0.05559165 0.04462462 0.05092247 0.03689375
##
## , , pam
##
## 2 3 4 5 6
## Connectivity 186.26230159 260.28492063 392.40952381 393.73611111 432.48134921
## Dunn 0.31550735 0.35939076 0.33268764 0.35536614 0.34687141
## Silhouette 0.09474668 0.08354821 0.04424713 0.04944241 0.04574817
## 7 8 9 10
## Connectivity 415.60158730 461.37380952 502.34166667 536.68055556
## Dunn 0.35597820 0.35723111 0.37809569 0.37809569
## Silhouette 0.04371763 0.03718951 0.03237722 0.02776716
plot(clvalidres)
summary(clvalidres)
##
## Clustering Methods:
## hierarchical kmeans pam
##
## Cluster sizes:
## 2 3 4 5 6 7 8 9 10
##
## Validation Measures:
## 2 3 4 5 6 7 8 9 10
##
## hierarchical Connectivity 8.7869 14.6448 14.6448 56.1107 56.1107 56.1107 56.1107 73.6845 76.6135
## Dunn 0.5333 0.5333 0.5333 0.4530 0.4530 0.4530 0.4530 0.4530 0.4530
## Silhouette 0.1640 0.1249 0.0975 0.0881 0.0842 0.0759 0.0645 0.0593 0.0362
## kmeans Connectivity 178.1619 240.7960 247.2222 322.3071 417.4060 438.8214 444.1131 461.7512 538.1202
## Dunn 0.3362 0.3719 0.3679 0.3693 0.3427 0.4115 0.3858 0.4070 0.3764
## Silhouette 0.1000 0.0890 0.0925 0.0623 0.0546 0.0556 0.0446 0.0509 0.0369
## pam Connectivity 186.2623 260.2849 392.4095 393.7361 432.4813 415.6016 461.3738 502.3417 536.6806
## Dunn 0.3155 0.3594 0.3327 0.3554 0.3469 0.3560 0.3572 0.3781 0.3781
## Silhouette 0.0947 0.0835 0.0442 0.0494 0.0457 0.0437 0.0372 0.0324 0.0278
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 8.7869 hierarchical 2
## Dunn 0.5333 hierarchical 2
## Silhouette 0.1640 hierarchical 2
Connectivity should be minimized; Dunn index should be maximized; Silhouette width should be maximized.
myK<-seq(from=3, to=6, by=1)
res_kmeans2<-kmeans(x=as.matrix(sortedcpg), centers=2, iter.max = 10, nstart = 1,
algorithm = c("Hartigan-Wong"), trace=FALSE)
mykmeansclusters <-matrix(NA, ncol=length(myK), nrow=nrow(sortedcpg))
for (i in 1:length(myK)){
mykmeansres = kmeans(x=as.matrix(sortedcpg), centers=myK[i],
iter.max = 10, nstart = 1,
algorithm = c("Hartigan-Wong"), trace=FALSE)
mykmeansclusters [,i] <-mykmeansres$cluster
}
colnames(mykmeansclusters)<-paste("Kmeans",myK, sep="_")
res_pam2<-pam(x=LFt, k=2, metric = “euclidean”, stand = FALSE)
mypamclusters <-matrix(NA, ncol=length(myK), nrow=nrow(sortedcpg))
for (i in 1:length(myK)){
mypamres = pam(x=as.matrix(sortedcpg), k=myK[i],
metric = "euclidean", stand = FALSE)
mypamclusters [,i] <-mypamres$clustering
}
colnames(mypamclusters)<-paste("PAM",myK, sep="_")
library(kernlab)
myspeccclusters <-matrix(NA, ncol=length(myK), nrow=nrow(sortedcpg))
for (i in 1:length(myK)){
myspeccres<-specc(as.matrix(sortedcpg), centers=myK[i])
myspeccclusters [,i] <-myspeccres@.Data
}
colnames(myspeccclusters)<-paste("specc",myK, sep="_")
library(mclust)
## Package 'mclust' version 5.4.7
## Type 'citation("mclust")' for citing this R package in publications.
# compute BIC for all covariance structures and up to 9 components
# Gives a glimpse as to which model-component pairs are best for our data according to BIC
myG <- myK
mybic<-mclustBIC(data=sortedcpg, G=myG)
mybic
## Bayesian Information Criterion (BIC):
## EII VII EEI VEI EVI VVI
## 3 -75078.23 -65829.68 -65682.26 -52879.53 -59961.536 -52861.351
## 4 -53979.90 -49013.18 -41976.95 -35383.02 -32933.871 -30840.522
## 5 -45422.72 -36711.45 -32288.82 -21022.99 -12148.528 -10992.574
## 6 -39823.91 -29648.30 -25669.58 -11431.54 -5798.975 -577.452
##
## Top 3 models based on the BIC criterion:
## VVI,6 EVI,6 VVI,5
## -577.452 -5798.975 -10992.574
plot(mybic)
mymodelNames<-c("EII", "VII", "EEI", "VEI", "EVI", "VVI")
matBIC<-as.data.frame(matrix(mybic, ncol=length(mymodelNames), nrow=length(myG)))
colnames(matBIC)<-mymodelNames
rownames(matBIC)<-myG
top<-3
topBIC<-head(sort(mybic, decreasing=TRUE),top)
myrow<-rep(NA, top)
mycol<-rep(NA, top)
for(i in 1:length(topBIC)){
w<-which(topBIC[i]==matBIC, arr.ind = TRUE)
print(w)
myrow[i]<-rownames(matBIC)[w[1,1]]
mycol[i]<-colnames(matBIC)[w[1,2]]
}
## row col
## 6 4 6
## row col
## 6 4 5
## row col
## 5 3 6
mymclusters<-matrix(NA, nrow=nrow(sortedcpg), ncol=top)
for (i in 1:top){
resmclust<-Mclust(data=sortedcpg, G=myrow[i], modelNames = mycol[i])
mymclusters[,i]<-resmclust$classification
}
colnames(mymclusters)<-paste("mclust", mycol, myrow, sep="_")
head(mymclusters)
## mclust_VVI_6 mclust_EVI_6 mclust_VVI_5
## [1,] 1 1 1
## [2,] 2 2 2
## [3,] 3 5 3
## [4,] 1 1 1
## [5,] 4 4 4
## [6,] 1 1 1
# what if mclust with 2 groups?
resmclust2<-Mclust(data=sortedcpg, G=5, modelNames = "VVI")
par(pty="s")
plot(x=cpgtsne[,1], y=cpgtsne[,2],
bg=alphabet()[resmclust2$classification], pch=21, cex=1.2,
xlab="t-SNE dimension 1",ylab="t-SNE dimension 2")
clustersbig<-cbind(mykmeansclusters, mypamclusters, myspeccclusters,
mymclusters) #just plotting 1
rownames(mykmeansclusters)<-rownames(sortedcpg)
par(pty="s")
for (i in 1:ncol(clustersbig)){
plot(x=cpgtsne[,1], y=cpgtsne[,2],
bg=glasbey()[clustersbig[,i]], pch=21, cex=1.2,
xlab="t-SNE dimension 1",ylab="t-SNE dimension 2",
main=colnames(clustersbig)[i])
}
par(pty="s")
for (i in 1:ncol(clustersbig)){
plot(x=cpgumap$layout[,1], y=cpgumap$layout[,2],
bg=glasbey()[clustersbig[,i]], pch=21, cex=1.2,
xlab="UMAP dimension 1",ylab="UMAP dimension 2",
main=colnames(clustersbig)[i])
}
myK<-seq(from=3, to=6, by=1) #changed this to evaluate only a few
library(Mercator)
myjacc<-binaryDistance(t(sortedcpg), metric="jaccard")
set.seed(123)
myjacctsne<-Rtsne(myjacc, perplexity=15, exaggeration_factor=10)
mysokal<-binaryDistance(t(sortedcpg), metric="sokalMichener")
set.seed(456)
mysokaltsne<-Rtsne(mysokal, perplexity=15, exaggeration_factor=10)
par(mfrow=c(1,2), pty="s")
plot(myjacctsne$Y, xlab="t-SNE dimension 1",ylab="t-SNE dimension 2",
main="TSNE on Jaccard distance")
plot(mysokaltsne$Y, xlab="t-SNE dimension 1",ylab="t-SNE dimension 2",
main="TSNE on Sokal-Michener distance")
Dimension reduction using UMAP:
myumapsetting<-umap.defaults
myumapsetting$spread<-4 #used during automatic estimation of a/b parameters
myumapsetting$min_dist<-0.5 #must be smaller than "spread"; how close points appear in final layout
# UMAP
set.seed(123)
myjaccumap<-umap(as.matrix(myjacc), myumapsetting)
par(pty="s")
plot(myjaccumap[["layout"]], xlab="UMAP dimension 1", ylab="UMAP dimension 2")
mykmeansclusters <-matrix(NA, ncol=length(myK), nrow=nrow(sortedcpg))
for (i in 1:length(myK)){
mykmeansres = kmeans(x=myjacc, centers=myK[i],
iter.max = 10, nstart = 1,
algorithm = c("Hartigan-Wong"), trace=FALSE)
mykmeansclusters [,i] <-mykmeansres$cluster
}
colnames(mykmeansclusters)<-paste("Kmeans",myK, sep="_")
res_pam2<-pam(x=LFt, k=2, metric = “euclidean”, stand = FALSE)
mypamclusters <-matrix(NA, ncol=length(myK), nrow=nrow(sortedcpg))
for (i in 1:length(myK)){
mypamres = pam(x=myjacc, k=myK[i],
metric = "euclidean", stand = FALSE)
mypamclusters [,i] <-mypamres$clustering
}
colnames(mypamclusters)<-paste("PAM",myK, sep="_")
library(kernlab)
myspeccclusters <-matrix(NA, ncol=length(myK), nrow=nrow(sortedcpg))
for (i in 1:length(myK)){
myspeccres<-specc(as.matrix(myjacc), centers=myK[i])
myspeccclusters [,i] <-myspeccres@.Data
}
colnames(myspeccclusters)<-paste("specc",myK, sep="_")
library(mclust)
# compute BIC for all covariance structures and up to 9 components
# Gives a glimpse as to which model-component pairs are best for our data according to BIC
myG <- myK
mybic<-mclustBIC(data=as.matrix(myjacc), G=myG)
mybic
## Bayesian Information Criterion (BIC):
## EII VII EEI VEI EVI VVI
## 3 1008779 1013564 1008635 1015818 1075568 1081559
## 4 1027650 1034404 1028534 1035319 1120088 1111123
## 5 1040425 1045111 1039698 1046146 1148466 1144454
## 6 1044462 1049809 1043184 1050369 1188085 1196575
##
## Top 3 models based on the BIC criterion:
## VVI,6 EVI,6 EVI,5
## 1196575 1188085 1148466
plot(mybic)
mymodelNames<-c("EII", "VII", "EEI", "VEI", "EVI", "VVI")
matBIC<-as.data.frame(matrix(mybic, ncol=length(mymodelNames), nrow=length(myG)))
colnames(matBIC)<-mymodelNames
rownames(matBIC)<-myG
top<-3
topBIC<-head(sort(mybic, decreasing=TRUE),top)
myrow<-rep(NA, top)
mycol<-rep(NA, top)
for(i in 1:length(topBIC)){
w<-which(topBIC[i]==matBIC, arr.ind = TRUE)
print(w)
myrow[i]<-rownames(matBIC)[w[1,1]]
mycol[i]<-colnames(matBIC)[w[1,2]]
}
## row col
## 6 4 6
## row col
## 6 4 5
## row col
## 5 3 5
mymclusters<-matrix(NA, nrow=nrow(sortedcpg), ncol=top)
for (i in 1:top){
resmclust<-Mclust(data=as.matrix(myjacc), G=myrow[i], modelNames = mycol[i])
mymclusters[,i]<-resmclust$classification
}
colnames(mymclusters)<-paste("mclust", mycol, myrow, sep="_")
head(mymclusters)
## mclust_VVI_6 mclust_EVI_6 mclust_EVI_5
## [1,] 1 6 5
## [2,] 2 2 2
## [3,] 3 3 3
## [4,] 4 4 3
## [5,] 1 1 1
## [6,] 1 4 1
clustersbig<-cbind(mykmeansclusters, mypamclusters, myspeccclusters,
mymclusters) #just plotting 1
rownames(mykmeansclusters)<-rownames(sortedcpg)
par(pty="s")
for (i in 1:ncol(clustersbig)){
plot(x=myjacctsne$Y[,1], y=myjacctsne$Y[,2],
bg=glasbey()[clustersbig[,i]], pch=21, cex=1.2,
xlab="t-SNE dimension 1",ylab="t-SNE dimension 2",
main=colnames(clustersbig)[i])
}
par(pty="s")
for (i in 1:ncol(clustersbig)){
plot(x=myjaccumap$layout[,1], y=myjaccumap$layout[,2],
bg=glasbey()[clustersbig[,i]], pch=21, cex=1.2,
xlab="UMAP dimension 1",ylab="UMAP dimension 2",
main=colnames(clustersbig)[i])
}
Save cluster calls with from PAM K=3 using Jaccard distance:
clustersfinal<-clustersbig[,"PAM_3"]
# read in clinical table
sortedclin<-readRDS(file="hnsc_clin_sorted.RDS")
dim(sortedclin)
## [1] 528 79
Identify which columns in sortedclin have ALL NA values and drop them.
isna<-apply(sortedclin, 2, function(x) sum(is.na(x))==nrow(sortedclin))
table(isna)
## isna
## FALSE TRUE
## 74 5
sortedclin<-sortedclin[,-which(isna)] #save over original
numlevels<-apply(sortedclin, 2, function(x) length(unique(x)))
# sort(numlevels)
names(numlevels)[which(numlevels<=10)]
## [1] "patient.histologic_diagnosis"
## [2] "patient.laterality"
## [3] "patient.prospective_collection"
## [4] "patient.retrospective_collection"
## [5] "patient.gender"
## [6] "patient.race"
## [7] "patient.ethnicity"
## [8] "patient.history_other_malignancy"
## [9] "patient.history_neoadjuvant_treatment"
## [10] "patient.lymph_node_neck_dissection_indicator"
## [11] "patient.lymph_node_dissection_method...17"
## [12] "patient.lymph_node_dissection_method...18"
## [13] "patient.lymph_nodes_examined"
## [14] "patient.lymph_nodes_examined_ihc_count"
## [15] "patient.margin_status"
## [16] "patient.egfr_amplification_status"
## [17] "patient.vital_status"
## [18] "patient.tumor_status"
## [19] "patient.ajcc_staging_edition"
## [20] "patient.ajcc_tumor_pathologic_pt"
## [21] "patient.ajcc_nodes_pathologic_pn"
## [22] "patient.ajcc_metastasis_pathologic_pm"
## [23] "patient.ajcc_pathologic_tumor_stage"
## [24] "patient.extracapsular_spread_pathologic"
## [25] "patient.tumor_grade"
## [26] "patient.lymphovascular_invasion"
## [27] "patient.perineural_invasion"
## [28] "patient.hpv_status_p16"
## [29] "patient.hpv_status_ish"
## [30] "patient.tobacco_smoking_history_indicator"
## [31] "patient.alcohol_history_documented"
## [32] "patient.alcohol_consumption_frequency"
## [33] "patient.radiation_treatment_adjuvant"
## [34] "patient.pharmaceutical_tx_adjuvant"
## [35] "patient.treatment_outcome_first_course"
## [36] "patient.new_tumor_event_dx_indicator"
## [37] "patient.clinical_M"
## [38] "patient.clinical_N"
## [39] "patient.clinical_T"
## [40] "patient.clinical_stage"
## [41] "patient.days_to_initial_pathologic_diagnosis"
## [42] "patient.icd_o_3_histology"
## [43] "patient.informed_consent_verified"
## [44] "patient.tumor_tissue_site"
## [45] "fu48.vital_status"
## [46] "fu48.tumor_status"
## [47] "fu48.treatment_outcome_first_course"
## [48] "fu48.tobacco_smokeless_use_at_dx"
## [49] "rad.treatment_best_response"
## [50] "rad.radiation_therapy_site"
## [51] "chemoyes"
interesting<-names(numlevels)[which(numlevels<=10)]
subsetclin<-sortedclin[,interesting]
^We may need to binarize this and plot as shapes on top of the TSNE:
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
clinbinary<-dummy.data.frame(subsetclin)
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
# only plot columns that have a substantial amount of information
w<-which(colSums(clinbinary)>=100) #528/3 = 176
clinbinlist <- colnames(clinbinary)[w]
clinbinary2 <-clinbinary[,w]
par(pty="s")
for (i in 1:ncol(clinbinary2)){
variable<-colnames(clinbinary2)[i]
plot(myjacctsne$Y, bg=glasbey()[clustersfinal], main=colnames(clinbinary2)[i], pch=21)
points(myjacctsne$Y[clinbinary2[,i]==1,1],
myjacctsne$Y[clinbinary2[,i]==1,2],
lwd=2, col="black", pch=2)
legend("bottomright", legend=colnames(clinbinary2)[i],
col="black", pch=2, cex=0.7)
}
require(survival)
library(survminer)
library(ggpubr)
library(tibble)
plot.ggsurv<- function(surv.fit, clin.table) {
ggsurv<-ggsurvplot(fit =surv.fit, clin.table,
palette = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"),
pval = TRUE, #conf.int = TRUE,
legend=c(0.7,0.85),legend.title = "Patient Group",
xlab = "Time/days",
ggtheme = theme_light())
ggsurv$plot <- ggsurv$plot +
theme(legend.text = element_text(size = 16, color = "black"),
axis.title.x = element_text(size = 16, color = "black"),
axis.title.y = element_text(size = 16, color = "black"))
return(ggsurv)
}
numlevels<-apply(sortedclin, 2, function(x) length(unique(x)))
# sort(numlevels)
names(numlevels)[which(numlevels<=6)]
## [1] "patient.histologic_diagnosis"
## [2] "patient.laterality"
## [3] "patient.prospective_collection"
## [4] "patient.retrospective_collection"
## [5] "patient.gender"
## [6] "patient.race"
## [7] "patient.ethnicity"
## [8] "patient.history_other_malignancy"
## [9] "patient.history_neoadjuvant_treatment"
## [10] "patient.lymph_node_neck_dissection_indicator"
## [11] "patient.lymph_node_dissection_method...17"
## [12] "patient.lymph_node_dissection_method...18"
## [13] "patient.lymph_nodes_examined"
## [14] "patient.margin_status"
## [15] "patient.egfr_amplification_status"
## [16] "patient.vital_status"
## [17] "patient.tumor_status"
## [18] "patient.ajcc_staging_edition"
## [19] "patient.ajcc_metastasis_pathologic_pm"
## [20] "patient.extracapsular_spread_pathologic"
## [21] "patient.tumor_grade"
## [22] "patient.lymphovascular_invasion"
## [23] "patient.perineural_invasion"
## [24] "patient.hpv_status_p16"
## [25] "patient.hpv_status_ish"
## [26] "patient.tobacco_smoking_history_indicator"
## [27] "patient.alcohol_history_documented"
## [28] "patient.radiation_treatment_adjuvant"
## [29] "patient.pharmaceutical_tx_adjuvant"
## [30] "patient.treatment_outcome_first_course"
## [31] "patient.new_tumor_event_dx_indicator"
## [32] "patient.clinical_M"
## [33] "patient.days_to_initial_pathologic_diagnosis"
## [34] "patient.icd_o_3_histology"
## [35] "patient.informed_consent_verified"
## [36] "patient.tumor_tissue_site"
## [37] "fu48.vital_status"
## [38] "fu48.tumor_status"
## [39] "fu48.treatment_outcome_first_course"
## [40] "fu48.tobacco_smokeless_use_at_dx"
## [41] "rad.treatment_best_response"
## [42] "rad.radiation_therapy_site"
## [43] "chemoyes"
interesting<-names(numlevels)[which(numlevels<=6)]
interestingimportant<-c(interesting, "patient.birth_days_to", "patient.death_days_to")
subsetclin<-sortedclin[,interestingimportant]
clin_sub<-subsetclin %>% mutate(patient.age_year= round(-as.numeric(patient.birth_days_to)/365, 2), patient.event=sortedclin$patient.vital_status %>% as.factor() %>% as.numeric() -1, #Alive=0,Dead=1 )
clin_sub<-subsetclin %>%
mutate(patient.age_year= round(-as.numeric(patient.birth_days_to)/365, 2),
patient.event=sortedclin$patient.vital_status %>% as.factor() %>% as.numeric() -1, #Alive=0,Dead=1
patient.survDays=rowSums(sortedclin[,
c("patient.last_contact_days_to","patient.death_days_to")], na.rm=T))
summary(clin_sub$patient.age_year)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 20.00 53.35 61.14 61.44 69.17 90.06 1
There is one patient that has an unknown birth date that needs to be dropped:
nobirthday<-which(is.na(clin_sub$patient.birth_days_to))
clin_sub1<-clin_sub[-nobirthday,]
clin_sub1$patient.age_year %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.00 53.35 61.14 61.44 69.17 90.06
firstq<-summary(clin_sub1$patient.age_year)[2]
secondq<-summary(clin_sub1$patient.age_year)[4]
thirdq<-summary(clin_sub1$patient.age_year)[5]
clin_sub2<-clin_sub1 %>%
mutate(age_group = ifelse(patient.age_year>=secondq, "Above_median","Below_median"),
age_group1 = case_when(patient.age_year <=firstq ~ "1st quantile",
patient.age_year >firstq & patient.age_year <=secondq ~ "2nd quantile",
patient.age_year >secondq & patient.age_year<=thirdq ~ "3rd quantile",
patient.age_year > thirdq ~ "4th quantile"))
fit_age<- survfit(Surv(patient.survDays, patient.event) ~ age_group1, clin_sub2)
plot.ggsurv(fit_age,clin_sub2)
fit_clusters<- survfit(Surv(patient.survDays, patient.event) ~ clustersfinal, clin_sub)
plot.ggsurv(fit_clusters,clin_sub)
Things to do: -multivariate survival model with cluster calls as one of the predictors -post-hoc analysis to figure out what cluster is the most